In [ ]:
##################################################################################################
## Notebook used for extracting text from html files. Some basic preprocessing tasks
## v1.0 Reading text using BeautifulSoup
## Required Packages: os, BeautifulSoup
## The html files are not included in the repository
## They can be downloaded from the following link
## https://www.rbi.org.in/scripts/SearchResults.aspx?search=rajan&sp=speeches
##################################################################################################
In [ ]:
import os
from bs4 import BeautifulSoup as bs
In [ ]:
## Reading all the html files in the directory
## os.listdir returns entries in arbitrary order, so the names are sorted to
## keep positional access such as htmlFiles[0] reproducible between runs
##
rootDir = 'E:\\NLP Session\\RBIGovernorSpeeches\\'
htmlFiles = sorted(f for f in os.listdir(rootDir) if f.endswith('.html'))
htmlFiles
In [ ]:
## Selecting the first html file in the set
## os.path.join keeps the path valid even if rootDir has no trailing separator
##
fileName = os.path.join(rootDir, htmlFiles[0])
## print() call form runs on both Python 2 and Python 3
print(fileName)
In [ ]:
## Opening the file and converting it to a 'soup' object
## The with-block closes the file handle as soon as the markup has been parsed
with open(fileName) as htmlFile:
    soup = bs(htmlFile, 'html.parser')
soup
In [ ]:
## Prints out a pretty version of the soup
## (the string returned by soup.prettify() is written to stdout)
##
print(soup.prettify())
In [ ]:
## Shows the title element of the page
## (bare expression: the notebook echoes its value as the cell output)
##
soup.title
In [ ]:
## Shows the title of the page as a string, i.e. the .string of the title element
## (bare expression: the notebook echoes its value as the cell output)
##
soup.title.string
In [ ]:
## Shows the first paragraph (p element) in the page
## (bare expression: the notebook echoes its value as the cell output)
##
soup.p
In [ ]:
## Shows the string of the first paragraph (p element) in the page
## NOTE(review): per the BeautifulSoup docs, .string is None when the tag has
## more than one child - confirm this is acceptable for these pages
##
soup.p.string
In [ ]:
## Prints out all the links in the webpage:
## the href attribute of every anchor (a) element, one per line
## (link.get returns None for an anchor without href, so 'None' is printed for it)
##
for link in soup.find_all('a'):
    print(link.get('href'))
In [ ]:
## Extracts the text from the soup object and prints it
##
print(soup.get_text())
In [ ]:
## Parses the file again so that html tags can be extracted from a fresh tree
## The with-block closes the file handle once the markup has been parsed
with open(fileName) as htmlFile:
    soup = bs(htmlFile, 'html.parser')
## Removes the styling and other non-content elements from the tree
## NOTE(review): '[document]' is kept from the original list; it looks like it
## never matches an element inside the soup - confirm before dropping it
for tag in soup(["script", "style", "title", '[document]', 'head']):
    tag.extract()
## Extracts the text from the soup, dropping every non-ASCII character
## Decoding the ASCII bytes before str() gives a native str on both Python 2
## and Python 3 (str() on bytes would give a "b'...'" literal on Python 3)
cleaned = str(soup.get_text(separator=' ').encode('ascii', 'ignore').decode('ascii'))
## Strips leading and trailing whitespace
cleanedtext = cleaned.strip()
cleanedtext